10 years ago · 4d623c5893
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -23,14 +23,16 @@ module Agents
 
                 
              
 
                       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
              
 
                 
              
 
                -      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab.  An example:
              
 
                +      When parsing HTML or XML, these sub-hashes specify how each extraction should be done.  The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`.  It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string.  Here's an example:
              
 
                 
              
 
                           "extract": {
              
 
                -            "url": { "css": "#comic img", "attr": "src" },
              
 
                -            "title": { "css": "#comic img", "attr": "title" },
              
 
                -            "body_text": { "css": "div.main", "text": true }
              
 
                +            "url": { "css": "#comic img", "value": "@src" },
              
 
                +            "title": { "css": "#comic img", "value": "@title" },
              
 
                +            "body_text": { "css": "div.main", "value": "text()" }
              
 
                           }
              
 
                 
              
 
                +      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and "text()" is to extract the enclosed text.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number.
              
 
                +
              
 
                       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
              
 
                 
              
 
                           "extract": {
              
@@ -70,9 +72,9 @@ module Agents
 
                           'type' => "html",
              
 
                           'mode' => "on_change",
              
 
                           'extract' => {
              
 
                -            'url' => { 'css' => "#comic img", 'attr' => "src" },
              
 
                -            'title' => { 'css' => "#comic img", 'attr' => "alt" },
              
 
                -            'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
              
 
                +            'url' => { 'css' => "#comic img", 'value' => "@src" },
              
 
                +            'title' => { 'css' => "#comic img", 'value' => "@alt" },
              
 
                +            'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
              
 
                           }
              
 
                       }
              
 
                     end
              
@@ -157,14 +159,11 @@ module Agents
 
                                   return
              
 
                                 end
              
 
                                 result = nodes.map { |node|
              
 
                -                  if extraction_details['attr']
              
 
                -                    node.attr(extraction_details['attr'])
              
 
                -                  elsif extraction_details['text']
              
 
                -                    node.text()
              
 
                -                  else
              
 
                -                    error '"attr" or "text" is required on HTML or XML extraction patterns'
              
 
                -                    return
              
 
                +                  value, = node.xpath(extraction_details['value'])
              
 
                +                  if value.is_a?(Float) && value.to_i == value
              
 
                +                    value = value.to_i
              
 
                                   end
              
 
                +                  value.to_s
              
 
                                 }
              
 
                                 log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              
 
                               end
              
--- a/db/migrate/20140723110551_adopt_xpath_in_website_agent.rb
+++ b/db/migrate/20140723110551_adopt_xpath_in_website_agent.rb
@@ -0,0 +1,35 @@
+# Converts stored WebsiteAgent HTML/XML extraction options from the old
+# "text"/"attr" keys to the XPath-based "value" key.
+class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
+  def up
+    # find_each loads agents in batches instead of instantiating them all at once.
+    Agent.where(type: 'Agents::WebsiteAgent').find_each do |agent|
+      # JSON extractions are skipped; only HTML/XML patterns are rewritten.
+      next if agent.extraction_type == 'json'
+
+      extract = agent.options['extract']
+      # An agent without an extraction hash has nothing to convert; skipping it
+      # avoids a NoMethodError that would abort the whole migration.
+      next unless extract.is_a?(Hash)
+
+      agent.options_will_change!
+      extract.each_value do |extraction|
+        next unless extraction.is_a?(Hash)
+
+        if extraction.delete('text')
+          # "text" used to call node.text(), i.e. the text of all descendants.
+          # string(.) is the XPath equivalent; a bare text() would only match
+          # direct child text nodes, and the agent keeps just the first match.
+          extraction['value'] = 'string(.)'
+        elsif (attr = extraction.delete('attr'))
+          extraction['value'] = "@#{attr}"
+        end
+      end
+      agent.save!
+    end
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
+  end
+end
              
--- a/spec/fixtures/agents.yml
+++ b/spec/fixtures/agents.yml
@@ -10,8 +10,8 @@ jane_website_agent:
 
                                  :expected_update_period_in_days => 2,
              
 
                                  :mode => :on_change,
              
 
                                  :extract => {
              
 
                -                     :title => {:css => "item title", :text => true},
              
 
                -                     :url => {:css => "item link", :text => true}
              
 
                +                     :title => {:css => "item title", :value => 'text()'},
              
 
                +                     :url => {:css => "item link", :value => 'text()'}
              
 
                                  }
              
 
                                }.to_json.inspect %>
              
 
                 
              
@@ -27,8 +27,8 @@ bob_website_agent:
 
                                  :expected_update_period_in_days => 2,
              
 
                                  :mode => :on_change,
              
 
                                  :extract => {
              
 
                -                   :url => {:css => "#comic img", :attr => "src"},
              
 
                -                   :title => {:css => "#comic img", :attr => "title"}
              
 
                +                   :url => {:css => "#comic img", :value => "@src"},
              
 
                +                   :title => {:css => "#comic img", :value => "@title"}
              
 
                                  }
              
 
                                }.to_json.inspect %>
              
 
                 
              
--- a/spec/models/agent_spec.rb
+++ b/spec/models/agent_spec.rb
@@ -768,8 +768,8 @@ describe AgentDrop do
 
                         url: 'http://dilbert.com/',
              
 
                         mode: 'on_change',
              
 
                         extract: {
              
 
                -          url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
              
 
                -          title: { css: '.STR_DateStrip', text: true },
              
 
                +          url: { css: '[id^=strip_enlarged_] img', value: '@src' },
              
 
                +          title: { css: '.STR_DateStrip', value: 'text()' },
              
 
                         },
              
 
                       },
              
 
                       schedule: 'every_12h',
              
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do
 
                         'url' => "http://xkcd.com",
              
 
                         'mode' => 'on_change',
              
 
                         'extract' => {
              
 
                -          'url' => { 'css' => "#comic img", 'attr' => "src" },
              
 
                -          'title' => { 'css' => "#comic img", 'attr' => "alt" },
              
 
                -          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
              
 
                +          'url' => { 'css' => "#comic img", 'value' => "@src" },
              
 
                +          'title' => { 'css' => "#comic img", 'value' => "@alt" },
              
 
                +          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
              
 
                         }
              
 
                       }
              
 
                       @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2)
              
@@ -256,8 +256,8 @@ describe Agents::WebsiteAgent do
 
                           'url' => "http://xkcd.com",
              
 
                           'mode' => "on_change",
              
 
                           'extract' => {
              
 
                -            'url' => {'css' => "#topLeft a", 'attr' => "href"},
              
 
                -            'title' => {'css' => "#topLeft a", 'text' => "true"}
              
 
                +            'url' => {'css' => "#topLeft a", 'value' => "@href"},
              
 
                +            'title' => {'css' => "#topLeft a", 'value' => "text()"}
              
 
                           }
              
 
                         }
              
 
                         rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
              
@@ -389,9 +389,9 @@ describe Agents::WebsiteAgent do
 
                         'url' => "http://www.example.com",
              
 
                         'mode' => 'on_change',
              
 
                         'extract' => {
              
 
                -          'url' => { 'css' => "#comic img", 'attr' => "src" },
              
 
                -          'title' => { 'css' => "#comic img", 'attr' => "alt" },
              
 
                -          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
              
 
                +          'url' => { 'css' => "#comic img", 'value' => "@src" },
              
 
                +          'title' => { 'css' => "#comic img", 'value' => "@alt" },
              
 
                +          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
              
 
                         },
              
 
                         'basic_auth' => "user:pass"
              
 
                       }
              
@@ -421,7 +421,7 @@ describe Agents::WebsiteAgent do
 
                         'mode' => 'on_change',
              
 
                         'headers' => { 'foo' => 'bar' },
              
 
                         'extract' => {
              
 
                -          'url' => { 'css' => "#comic img", 'attr' => "src" },
              
 
                +          'url' => { 'css' => "#comic img", 'value' => "@src" },
              
 
                         }
              
 
                       }
              
 
                       @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)